
*Clearing all pre-existing data and matrices from Stata
clear
clear matrix
clear mata
set more off
*Defining parameters for maximum number of variables and display
set maxvar 32767, permanently
set scrollbufsize 640000
*Adjusting graphical parameters
grstyle init
grstyle set plain

* figure output specifications
local fig_w = 5
local fig_res = `fig_w' * 600 
local fig_type tif
local figure_spec as(`fig_type') width(`fig_res')
local fig_ext "`fig_type'"

* set density estimation bandwidth
global kdensity_bw = 0.3


*Turn graphics on/off
set graphics on

global pathGJDdhs "${asset_densities}"


/**************************************************************************************************************************
																														  
 Description: this do-file reads individual IDHS files and performs PCA analysis on asset ownership to create			  
              a wealth variable and splits the sample into wealth quintiles. Here, the PCA variable computed
			  on agricultural assets only. 							  
 Input: individual IDHS files, downloaded from the IPUMS-DHS website (must all be placed on the same folder)             
 Output: 																  
	   - figures: (1) estimated kernel density of the PCA wealth score (agricultural)   
				
 Authors: Caterina Soto Vieira (c.soto-vieira@lse.ac.uk) 																  	
          Celine Zipfel (c.p.zipfel@lse.ac.uk)																			  
		  Gabriel Leite Mariante (g.leite-mariante@lse.ac.uk)															  
 Date: September 2020	(updated October 2021)
 Stata Version: MP 17.0
																														  
**************************************************************************************************************************/														


*Define a global list with all possible agricultural assets which may be present in IDHS samples	
global masteragassetlist "aglandyn cashcropyn drawncart tractor thresher graingr generatyn generatnum hmmill ploughyn whbarrowyn axehoeyn borehole wtrpumpyn farmimpyn livestockyn cattleyn cattlenum cowbullyn bullockyn exoticctyn horsemuleyn donkeyyn horseyn sheepyn goatyn sheepgoatyn pigyn pigimpyn camelyn catcamyn buffaloyn rabbityn otheranyn bullyn yakyn gpigyn chickyn chickimpyn"


*Create directory to save the various PCA graphs
cap mkdir "$pathGJDdhs/Output"


*Create local with all IDHS country files, which must be placed in the same folder	
local files : dir "$pathGJDdhs/Data" files "*.dta"
di `files'


*Loop over all country files
foreach file of local files{
	
	*Load idhs country file
	cd "$pathGJDdhs/Data"
	use `file', clear
	
	*Each IDHS country file contains all years combined, we run the PCA analysis on each year separately
	*Create an auxiliary variable equal to 1,2,3, ... where each value is one IDHS year
	egen aux_years = group(year)
	
	*Sumarising this variable gives us access to first and last year through the auxiliary variable
	quietly sum aux_years, d
	
	*Loop over each year using max and min from above
	forvalues i = `r(min)'/`r(max)'{
		
			
		*Keep only observations of the current year
		keep if aux_year == `i'
		*Create a local with the year to be used in plots and names of files
		local year_sample `=year[1]'
		
		*Create two locals from labels of country variable: one for display purposes, the other for naming purposes without spaces. Remove problematic character ` in all of them.
		decode country, gen(country_string)
		replace country_string = subinstr(country_string,"'","",.)
		local country_sample `=proper(country_string[1])'
		local country_sample_nos `=subinstr(proper(country_string[1])," ","",.)'
		
		*Display wave being used (just for debugging purposes)
		display "Current wave: `country_sample' `year_sample'"
		
		
		*Create locals with empty variable lists for all agricultural assets to filled up in the next step
		local agassetlist ""

		* Loop over the global asset-list from the beginning of the file and 
		* fill up the lists by checking which of the possible assets actually exists in each year.
		* Only consider variables which are not missing for at least 95% of the sample, to avoid a large number of missing values in the PCA afterwards
		
			
		* Agricultural assets
		foreach i of global masteragassetlist{
			capture confirm variable `i'
			if !_rc{
				quietly sum `i'
				if r(N) > 0.95*_N{
					local agassetlist "`agassetlist' `i'"
				}
			}
		}
		
			
		*Rename individual identifier for samples in which it has a different name
		cap rename mcaseid caseid
		
		*Keep only one observation per household
		duplicates drop hhid, force
		
		
		***** PCA on agricultural assets only *****
		
		*Count the number of existing assets in the list of agricultural assets
		local ag_check : word count `agassetlist'
		display "Number of agricultural assets = `ag_check'"
		
		*Run the PCA analysis on agricultural assets if there are at least 4 distinct assets
		if `ag_check' > 4{
			
			*Loop over all the existing agriultural assets, which are all binary, to create binary indicators to be used in the PCA analysis
			foreach var of local agassetlist{
				*For each asset, tabulate all possible variables with and without their labels
				tab `var'
				tab `var', nolab
				*Recode meaningless codes to missing (binary assets should be 1 or 0)
				recode `var' (6 7 8 9 = .)
				*Generate an indicator for each category (in the case of binary assets, only "no" and "yes")
				tab `var', gen(`var'_yn)
				*Drop the indicator for "no"
				cap drop `var'_yn1
			}
			
			*Run PCA analysis using the existing agricultural assets on rural households
			pca `agassetlist' if urban == 2
			
			*Generate wealth score based on the result of the PCA
			predict agassets_pca_idhs
			
			*Create quintiles of agricultural assets
			xtile [w=perweight] agassets_pca_idhs_q = agassets_pca_idhs if urban == 2, nq(5)
			
			* make density plots
			preserve
				*Keep only rural households, for plotting
				keep if urban == 2
				
				*Drop if agricultural PCA score is missing, or if household is below p0.1 or p99.9
				drop if missing(agassets_pca_idhs)	
				_pctile agassets_pca_idhs, p(0.1 99.9)
				drop if agassets_pca_idhs < r(r1) | agassets_pca_idhs > r(r2)
				
				*Create quintiles of agricultural asset 
				pctile [w=perweight] quint = agassets_pca_idhs, nq(5)
				tab quint
				
				*Create auxiliary variable with the threshold of wealth value of each quintile, for plotting purposes
				//bys quint: gen aux = quint if _n == 1
				gen aux = quint
				
				*Sorting the data by aux, so that the first 4 observations are the threshold for each agricultural wealth quintile	
				sort aux
				
				* estimate and plot kernel density of the PCA wealth score, and add vertical lines at the thresholds for each quintile
					
					tw kdensity agassets_pca_idhs [aw=perweight], xline(`=aux[1]' `=aux[2]' `=aux[3]' `=aux[4]', lpattern(dash) lcolor(gs10)) ///
						bw(${kdensity_bw}) ///
						title(" ") ///
						xtitle("PCA score") ytitle("Density") ///
						note("")
						
						* output figure as .tif
					graph export "$pathGJDdhs\Output\idhs_kdensity_pca_wealth_index_`country_sample_nos'_`year_sample'_agassets.`fig_ext'", replace `figure_spec'
										
			restore
			
		}

			
		*Re-open file with all years for next loop iteration on the current country
		use `file', clear
		
		*Re-create grouped variable for the sample year
		egen aux_years = group(year)			
	}
		
}

